Loading and arranging the data

Setup:

knitr::opts_chunk$set(echo = TRUE, max.print = 100)

Load libraries:

library(readxl)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(stringi)
library(Cairo)
library(ape)
library(geosphere)
library(Matrix)
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(ggrepel)
library(ggpubr)
## 
## Attaching package: 'ggpubr'
## 
## The following object is masked from 'package:ape':
## 
##     rotate
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(geodata)
## Loading required package: terra
## terra 1.5.21
## 
## Attaching package: 'terra'
## 
## The following object is masked from 'package:ggpubr':
## 
##     rotate
## 
## The following objects are masked from 'package:ape':
## 
##     rotate, trans, zoom
## 
## The following object is masked from 'package:dplyr':
## 
##     src
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## The following object is masked from 'package:ggplot2':
## 
##     arrow
library(ggforce)
library(concaveman)

Load data:

Phonotacticon <- read_xlsx("Phonotacticon.xlsx", guess_max = 1661)
PhonoBib <- read_xlsx("PhonoBib.xlsx")
PanPhon <- read_csv("PanPhonPhonotacticon1_0.csv") %>% 
  filter(!duplicated(ipa))
## Rows: 7416 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ipa
## dbl (22): syl, son, cons, cont, delrel, lat, nas, strid, voi, sg, cg, ant, c...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Phonotacticon
PhonoBib
PanPhon

Arrange PanPhon segments in alphabetical order:

PanPhonOrder <- PanPhon$ipa[order(-nchar(PanPhon$ipa), 
                                                 PanPhon$ipa)]

head(PanPhonOrder)
## [1] "h͡d̪͡ɮ̪ʲʷ" "h͡d̪͡ɮ̪ʷː" "h͡d̪͡ɮ̪ʷˠ" "h͡d̪͡ɮ̪ʷˤ" "h͡d̪͡z̪ʲʷ" "h͡d̪͡z̪ʷː"

Create PanPhon regex:

PanPhonRegex <- paste0("(?:",
                       paste(PanPhonOrder, collapse="|"),
                       ")")

str_trunc(PanPhonRegex, 100)
## [1] "(?:h͡d̪͡ɮ̪ʲʷ|h͡d̪͡ɮ̪ʷː|h͡d̪͡ɮ̪ʷˠ|h͡d̪͡ɮ̪ʷˤ|h͡d̪͡z̪ʲʷ|h͡d̪͡z̪ʷː|h͡d̪͡z̪ʷˠ|h͡d̪͡z̪ʷˤ|h͡t̪͡ɬ̪ʲʷ|h͡t̪..."

Subset Eurasian lects that are complete in Phonotacticon:

Eurasia <- Phonotacticon %>% 
  filter(Complete,
         complete.cases(P),
         O != "?",
         N != "?",
         C != "?",
         !grepl('[A-Z]|Č|Ł|\\[', O),
         !grepl('[A-Z]|Č|Ł|\\[', N),
         !grepl('[A-Z]|Č|Ł|\\[', C)
         )

Eurasia

Extract onset, nucleus, and coda sequences:

Sequences <- Eurasia %>% 
  select(Lect, O, N, C) %>% 
  pivot_longer(-Lect, names_to = 'Category', values_to = 'Sequence') %>% 
  separate(col = Sequence, 
           sep = ' ', 
           into = as.character(1:500),
           fill = 'right') %>% 
  pivot_longer(-c(Lect, Category), names_to = 'Number', values_to = 'Sequence') %>% 
  select(-Number) %>% 
  filter(!is.na(Sequence)) %>% 
  distinct()

Sequences

Split the sequences into segments:

Segments <- stri_extract_all_regex(Sequences$Sequence, 
                         pattern = PanPhonRegex,
                         simplify = TRUE) %>% 
  as_tibble(.name_repair = 'unique') %>% 
  mutate(Lect = Sequences$Lect,
         Category = Sequences$Category,
         Sequence = Sequences$Sequence) %>% 
  pivot_longer(cols = -c(Lect, Category, Sequence),
               names_to = 'Order',
               values_to = 'ipa') %>% 
  mutate(Order = Order %>% 
           as.factor() %>% 
           as.integer()) %>% 
  filter(ipa != "")
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
Segments

Measure the length of each sequence:

Sequences_length <- Segments %>% 
  group_by(Lect, Category, Sequence) %>% 
  summarise(Length = max(Order))
## `summarise()` has grouped output by 'Lect', 'Category'. You can override using
## the `.groups` argument.
Sequences_length

Join the length of each sequence to segments:

Segments <- left_join(Segments, Sequences_length)
## Joining, by = c("Lect", "Category", "Sequence")
Segments

Count the maximal length:

MaxLength <- max(Sequences_length$Length)

MaxLength
## [1] 5

Count the number of split segments:

Segments_number <- nrow(Segments)

Segments_number
## [1] 9814

Assign different positions to each sequence:

Sequences_rep <- bind_rows(rep(list(Segments), 5))

Sequences_rep <- Sequences_rep %>% 
  mutate(Position = rep(0:(MaxLength - 1),
                        each = Segments_number)) %>% 
  mutate(Order = Order + Position) %>% 
  filter(Length + Position <= MaxLength) %>% 
  select(-Length)

Sequences_rep

Calculating the phonological distance

Join segments with their phonological features:

Sequences_features <- Sequences_rep %>% 
  left_join(PanPhon, by = 'ipa') %>% 
  pivot_longer(cols = -c(Lect, 
                         Category, 
                         Sequence, 
                         Order, 
                         ipa, 
                         Position),
               names_to = 'Feature',
               values_to = 'Value') %>% 
  mutate(Feature = paste0(Feature, Order)) %>% 
  pivot_wider(names_from = 'Feature',
              values_from = 'Value') %>% 
  select(-Order, -ipa) %>% 
  pivot_longer(cols = -c(Lect, Category, Sequence, Position),
               names_to = 'Feature',
               values_to = 'Value') %>% 
  filter(!is.na(Value)) %>% 
  pivot_wider(names_from = 'Feature',
              values_from = 'Value') %>% 
  replace(is.na(.), 0)

Sequences_features

Paste sequence and position together:

Sequences_SeqPos <- Sequences_features %>%
  mutate(SequencePosition = paste0(Sequence, Position)) %>% 
  select(-Lect, -Category, -Position, -Sequence) %>% 
  select(SequencePosition, everything()) %>% 
  distinct()

Sequences_SeqPos

Calculate the distance between segments:

Sequences_distance <- Sequences_SeqPos %>% 
  select(-SequencePosition) %>%  
  dist(method = 'euclidean') %>% 
  as.matrix() %>%
  as_tibble(.name_repair = 'unique')

Sequences_distance <- bind_cols(Sequences_SeqPos$SequencePosition,
                                Sequences_distance)
## New names:
## • `` -> `...1`
colnames(Sequences_distance) <- c('SequencePosition',
                                 Sequences_SeqPos$SequencePosition)

Sequences_distance

Find the minimal distance among the different positions:

Sequences_MinDistance <- Sequences_distance %>% 
  pivot_longer(-SequencePosition, 
               names_to = 'Sequence.y', 
               values_to = 'Distance') %>% 
  mutate(Sequence.x = str_remove(SequencePosition, '[0-9]')) %>% 
  select(-SequencePosition) %>% 
  mutate(Sequence.y = str_remove(Sequence.y, '[0-9]')) %>% 
  group_by(Sequence.x, Sequence.y) %>% 
  summarise(Distance = min(Distance))
## `summarise()` has grouped output by 'Sequence.x'. You can override using the
## `.groups` argument.
Sequences_MinDistance

Subset the distance of /pl/ and other segments:

pl <- Sequences_MinDistance %>% 
  filter(Sequence.x == 'pl',
         Sequence.y != '∅') %>% 
  arrange(Distance)

pl

Subset the distance of /ia/ and other segments:

ia <- Sequences_MinDistance %>% 
  filter(Sequence.x == 'ia',
         Sequence.y != '∅') %>% 
  arrange(Distance)

ia

Compare the difference between two lects within the same category:

ONC_distance <- Sequences %>% 
  left_join(Sequences, by = c('Category')) %>%
  left_join(Sequences_MinDistance, 
            by = c('Sequence.x', 'Sequence.y')) %>% 
  group_by(Lect.x, Category, Sequence.x, Lect.y) %>% 
  summarize(Distance = min(Distance)) %>% 
  group_by(Lect.x, Category, Lect.y) %>% 
  summarize(Distance = mean(Distance)) %>% 
  mutate(Lect_vs_Lect = str_c(pmin(Lect.x, Lect.y),
                             "vs.",
                              pmax(Lect.x, Lect.y),
                              sep = " ")) %>% 
  group_by(Lect_vs_Lect, Category) %>% 
  mutate(Distance = max(Distance)) %>% 
  ungroup() %>% 
  select(Lect_vs_Lect, Category, Distance) %>% 
  distinct()
## `summarise()` has grouped output by 'Lect.x', 'Category', 'Sequence.x'. You can
## override using the `.groups` argument.
## `summarise()` has grouped output by 'Lect.x', 'Category'. You can override
## using the `.groups` argument.
ONC_distance

Count the number of tones in each lect:

Tones <- Eurasia %>%
  select(Lect, `T`) %>%
  mutate(`T` = gsub("\\-", NA, `T`)) %>%
  mutate(`T` = str_count(`T`, " ") + 1)

Tones[is.na(Tones$`T`),]$`T` <- 0

Tones

Calculate the Canberra distance between the numbers of tones of each pair of lects:

Tones_distance <- dist(Tones, method = 'canberra')
## Warning in dist(Tones, method = "canberra"): NAs introduced by coercion
Tones_distance[is.na(Tones_distance)] <- 0
Tones_distance <- Tones_distance %>%
    as.matrix() %>%
    as_tibble(.name_repair = 'unique')

Tones_distance <- bind_cols(Tones$Lect, Tones_distance)
## New names:
## • `` -> `...1`
colnames(Tones_distance) <- c('Lect', Tones$Lect)

Tones_distance <- Tones_distance %>%
    pivot_longer(-Lect, names_to = 'Lect2', values_to = 'Distance') %>%
    mutate(Lect_vs_Lect = str_c(pmin(Lect, Lect2),
                                "vs.",
                                pmax(Lect, Lect2),
                               sep = " "),
           Category = 'T') %>%
    select(Lect_vs_Lect, Category, Distance) %>%
    distinct()

Tones_distance

Bind segmental and tonal distances:

ONCT_distance <- rbind(ONC_distance,
                        Tones_distance) %>%
                  pivot_wider(names_from = Category,
                  values_from = Distance)

ONCT_distance

Normalize the distances:

ONCT_distance[, c('O', 'N', 'C', 'T')] <-
      scale(ONCT_distance[, c('O', 'N', 'C', 'T')])

ONCT_distance

Calculate the mean of the distances and obtain the phonological distances:

PhonoDist <- ONCT_distance %>%
  mutate(Distance = 
               rowMeans(ONCT_distance[, c('O', 'N', 'C', 'T')])) %>%
  mutate(Distance = Distance - min(Distance)) %>% 
  select(Lect_vs_Lect, Distance)

PhonoDist

Split the Lect_vs_Lect column of PhonoDist into Lect.x and Lect.y:

Lect_vs_Lect <- str_split_fixed(
  PhonoDist$Lect_vs_Lect, ' vs. ', n = 2) %>% 
  as_tibble(.name_repair = 'unique')
## New names:
## • `` -> `...1`
## • `` -> `...2`
colnames(Lect_vs_Lect) <- c('Lect.x', 'Lect.y')

Lect_vs_Lect

Visualize the phonological distances via multidimensional scaling:

PhonoScale <- PhonoDist %>% 
  bind_cols(Lect_vs_Lect) %>% 
  select(-Lect_vs_Lect) %>% 
  pivot_wider(names_from = Lect.y,
              values_from = Distance) %>% 
  select(-Lect.x) %>% 
  t() %>% 
  as.dist() %>% 
  cmdscale(k = 5) %>% 
  as.data.frame() %>% 
  rownames_to_column('Lect') %>% 
  as_tibble()
  
PhonoScale

Cluster PhonoScale into five groups:

K5 <- PhonoScale %>% 
  select(-Lect) %>% 
  kmeans(5)

K5
## K-means clustering with 5 clusters of sizes 19, 13, 19, 13, 24
## 
## Cluster means:
##           V1         V2         V3          V4           V5
## 1 -1.2502795 -0.1983147  0.1306058 -0.02328731 -0.006885694
## 2  0.2424814 -0.5709140  0.4698632  0.09300284 -0.143725762
## 3 -0.5600501  0.6182668 -0.1955981 -0.13741819  0.156174419
## 4  1.1343901  0.4310772  0.4133609  0.18151592  0.101243472
## 5  0.6873722 -0.2567171 -0.4269609 -0.02147248 -0.095175667
## 
## Clustering vector:
##  [1] 1 1 2 5 3 5 4 3 2 5 5 5 5 5 2 3 1 3 1 5 1 5 5 2 4 3 1 1 5 3 1 4 1 3 4 1 3 1
## [39] 5 1 5 1 4 5 5 3 3 5 2 4 1 3 2 5 4 3 2 2 3 4 2 1 5 1 1 2 3 5 3 5 2 4 5 3 4 4
## [77] 2 5 4 1 3 1 5 4 5 2 3 3
## 
## Within cluster sum of squares by cluster:
## [1]  8.657499  7.095161 19.225226  5.672026  5.226601
##  (between_SS / total_SS =  67.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Add cluster to PhonoScale:

PhonoScaleK <- PhonoScale %>% 
  mutate(Cluster = as.factor(K5$cluster))

PhonoScaleK

Visualize the multidimensional scaling (V1 and V2):

PhonoScalePlot <- ggplot(PhonoScaleK,
                         aes(x = V2, 
                             y = V1,
                             shape = Cluster,
                             fill = Cluster)) +
                  geom_point() +
                  geom_label_repel(aes(label = Lect),
                                   size = 2,
                    min.segment.length = 0,
                    force = 50,
                    show.legend = FALSE,
                    max.overlaps = 100) +
  scale_shape_manual(values = 21:25) +
  scale_fill_brewer(palette = 'Pastel1') +
  theme_classic() +
  coord_fixed()

cairo_pdf(file = 'PhonoScalePlot.pdf', 
          width = 7, 
          height = 8, 
          family = 'Times New Roman')
PhonoScalePlot
dev.off()
## quartz_off_screen 
##                 2
PhonoScalePlot

3D multidimensional scaling:

PhonoScale3D <- plot_ly(PhonoScaleK,
        x = ~V1,
        y = ~V2,
        z = ~V3,
        type = 'scatter3d',
        mode = 'text',
        text = ~Lect,
        color  = ~Cluster,
        textfont = list(size = 20)) %>% 
  layout(showlegend = FALSE)

PhonoScale3D

Testing the correlation between phonological and geographical distances

Calculate the geographical distances between each pair of lects:

Lect_LonLat <- PhonoBib %>% 
  filter(Lect %in% Eurasia$Lect) %>% 
  select(Lect, lon, lat)

Lect_Kilometers <- Lect_LonLat %>% 
  mutate(Kilometers = 'Kilometers')

Coordinates <- Lect_Kilometers %>% 
  full_join(Lect_Kilometers, by = 'Kilometers')

Coordinates <- Coordinates %>% 
  mutate(Lect_vs_Lect = str_c(pmin(Lect.x, Lect.y),
                             "vs.",
                              pmax(Lect.x, Lect.y),
                              sep = " "))

Coordinates.x <- select(Coordinates, lon.x, lat.x)
Coordinates.y <- select(Coordinates, lon.y, lat.y)

GeoDist <- Coordinates %>% 
  mutate(Kilometers = 
           distHaversine(Coordinates.x, Coordinates.y) / 1000) %>% 
  select(Lect_vs_Lect, Kilometers) %>% 
  distinct()

GeoDist

Join the phonological distances and geographical distances:

PhonoGeoDist <- full_join(PhonoDist, GeoDist, by = 'Lect_vs_Lect')

PhonoGeoDist

Filter out the distance of a lect from itself:

PhonoGeoDist <- PhonoGeoDist %>% 
  filter(!(grepl('(.*) vs\\. \\1', Lect_vs_Lect)))

PhonoGeoDist

Linear regression between geographical distance and phonological distances:

PhonoGeoDist %>% 
  lm(formula = Distance ~ Kilometers) %>% 
  summary()
## 
## Call:
## lm(formula = Distance ~ Kilometers, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.75128 -0.37079  0.01159  0.40279  1.65246 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.820e+00  1.559e-02  116.76   <2e-16 ***
## Kilometers  5.465e-05  3.624e-06   15.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5566 on 3823 degrees of freedom
## Multiple R-squared:  0.05616,    Adjusted R-squared:  0.05591 
## F-statistic: 227.5 on 1 and 3823 DF,  p-value: < 2.2e-16
PhonoGeoDist

Visualize the linear regression:

PhonoLM <- 
  ggplot(aes(x = Kilometers, y = Distance), data = PhonoGeoDist) +
  geom_point(alpha = 0.1) +
  geom_smooth(formula = y ~ x, method = 'lm', color = 'red') +
  theme_classic() +
  scale_x_continuous(name = 'Geographical distance (km)') +
  scale_y_continuous(name = 'Phonological distance (z)')

cairo_pdf(file = 'PhonoLM.pdf', 
          width = 4, 
          height = 3, 
          family = 'Times New Roman')
PhonoLM
dev.off()
## quartz_off_screen 
##                 2
PhonoLM

#Visualizing distances as lines

Subset lects and families from PhonoBib:

LectFamily <- PhonoBib %>% 
  select(Lect, Family)

LectFamily

Group lects together into pairs:

PhonoPair <- PhonoDist %>% 
  bind_cols(Lect_vs_Lect) %>% 
  left_join(PhonoGeoDist) %>% 
  filter(!is.na(Kilometers)) %>% 
  left_join(LectFamily, by = c('Lect.x' = 'Lect')) %>% 
  left_join(LectFamily, by = c('Lect.y' = 'Lect')) %>%
  mutate(Related = Family.x == Family.y,
           Lect_vs_Lect = as.factor(Lect_vs_Lect) %>% 
           as.integer()) %>% 
  select(-Family.x, -Family.y) %>% 
  pivot_longer(cols = c(Lect.x, Lect.y),
               names_to = 'xy',
               values_to = 'Lect') %>% 
  select(-xy) %>% 
  left_join(Lect_LonLat)
## Joining, by = c("Lect_vs_Lect", "Distance")
## Joining, by = "Lect"
PhonoPair
PhonoNear <- PhonoPair %>% 
  filter(Kilometers < 1500,
         Distance < quantile(Distance, 0.25))

Load map data:

map <- map_data("world")

head(world)
##                                                                  
## 1 function (resolution = 5, level = 0, path, version = "latest", 
## 2     ...)                                                       
## 3 {                                                              
## 4     stopifnot(level[1] == 0)                                   
## 5     resolution = round(resolution[1])                          
## 6     stopifnot(resolution %in% 1:5)

Create a map of Eurasia:

EurasiaMap <- ggplot(map, aes(x = long, y = lat)) + 
  geom_polygon(aes(group = group),
               fill = "white", 
               color = "darkgrey", 
               size = 0.2) +
  coord_map("ortho",
            orientation = c(20, 70, 0),
            xlim = c(10, 130),
            ylim = c(0, 90)) +
  theme_void()

EurasiaMap

PhonoLine <- EurasiaMap +
  geom_line(aes(x = lon,
                y = lat,
                group = Lect_vs_Lect,
                color = Related,
                linetype = Related,
                alpha = -Distance,
                size = -Distance),
            data = PhonoNear) +
  geom_point(aes(x = lon,
                 y = lat),
             shape = 21,
             fill = 'white',
             data = PhonoNear) +
  scale_size_continuous(range = c(0.5, 2)) +
  scale_alpha_continuous(range = c(0.1, 1))

PhonoLine